3251_Project_Group2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
#JG added:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
#import category_encoders as ce
from patsy import dmatrices
import scipy as sp
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Short snake_case column names used in place of the verbose CSV headers.
names = ['customer', 'state', 'clv', 'response', 'coverage_type', 'education', 'effective_to_date', 'employment_status', 'gender', 'income' , 'location_code', 'marital_status', 'monthly_premium', 'months_since_last_claim', 'months_since_policy_start', 'open_complaints', 'num_policies', 'policy_type', 'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size']
# Load the dataset from GitHub; header=0 discards the original header row
# and `names` supplies the replacements.
insurance= pd.read_csv('https://raw.githubusercontent.com/jgoodz8/3251_Project_Group2/master/AutoInsurance.csv',\
sep=",",header=0,names=names)
# Round the currency columns to cents and parse the policy effective date.
insurance['clv']=insurance['clv'].round(2)
insurance['effective_to_date'] = pd.to_datetime(insurance['effective_to_date'])
insurance['total_claim_amount'] = insurance['total_claim_amount'].round(2)
Showcases the correlation between the numeric variables
# Correlation heatmap of the numeric columns (lower triangle only).
# numeric_only=True keeps pandas >= 2.0 from raising on the string columns;
# plain `bool` replaces np.bool, which was removed in NumPy 1.24.
corr = insurance.corr(numeric_only=True)
_, ax = plt.subplots(figsize=(10, 10))
colormap = sns.diverging_palette(220, 10, as_cmap=True)
# Mask the upper triangle so each variable pair is drawn once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
_ = sns.heatmap(round(corr, 2),
                cmap=colormap,
                square=True,
                cbar_kws={'shrink': .6},
                ax=ax,
                annot=True,
                linewidths=0.1, vmax=1.0, linecolor='white',
                annot_kws={'fontsize': 12},
                mask=mask)
plt.title('Insurance Attributes Correlation Heatmap', y=1.05, size=15)
# US choropleth of total customer lifetime value per state.
# Sum only the 'clv' column: summing the whole frame would also try to
# aggregate the datetime/string columns, which fails on modern pandas.
state_clv = insurance.groupby('state')['clv'].sum().astype(float)
fig = go.Figure(data=go.Choropleth(
    # Alphabetical state codes, matching groupby's sorted output order.
    locations=["AZ", "CA", "NV", "OR", "WA"],  # Spatial coordinates
    z=state_clv,  # Data to be color-coded
    locationmode='USA-states',  # set of locations match entries in `locations`
    colorscale='Reds',
    colorbar_title="Millions USD",
))
fig.update_layout(
    title_text='Total Customer Life Value By State',
    geo_scope='usa',  # limit map scope to the USA
)
fig.show()
Customers from California provide the most amount business to the company (around $25 million).
Customers from Washington provide the least amount of business to the company (around $6.4 million)
Visualize the total value of the claims made by the location type
# Claim-amount distribution per location type; outliers hidden for readability.
plt.figure(figsize=(10,6))
sns.boxplot(x='location_code',y='total_claim_amount',data=insurance,showfliers=False)
plt.show()
From this we can conclude the following:
Suburban insurance policies have the highest claim amounts with a median value of ~ $500.
Rural locations have the lowest claim amounts with a median value of ~ $100.
There exists a stark difference of roughly $400 between the median claim amounts of rural and suburban insurance customers.
# Mean claim amount per state; the tight y-limits (415-450) deliberately
# magnify the small between-state differences.
plt.figure(figsize=(10, 6))
state = insurance.groupby(['state'])['total_claim_amount'].mean()
state.plot(kind='bar', ylim=(415, 450), color='b', alpha=0.2)
plt.title("Average Claim by State", fontsize=20)
plt.grid(True)  # was grid('True'): a truthy string that worked only by accident
plt.show()
# Scatter of customer lifetime value against total claim amount.
plt.figure(figsize=(10,6))
plt.scatter(insurance['clv'], insurance['total_claim_amount'],alpha=0.2)
plt.xlabel('Customer Life Time Value')
plt.ylabel('Total Claim amount')
plt.show()
# Per-state totals of CLV and claims; bubble size encodes CLV, color the state.
# Select the two columns with a list-of-columns — the tuple-style
# `['clv','total_claim_amount']` groupby indexing was removed in pandas 2.0.
customerState = insurance.groupby(['state'])[['clv', 'total_claim_amount']].sum()
customerState['state'] = customerState.index.values
fig = px.scatter(customerState, x=customerState['total_claim_amount'], y=customerState['clv'], size=customerState['clv'], color=customerState['state'], hover_name=customerState['state'], size_max=60)
fig.show()
# Scatter of income against total claim amount.
plt.figure(figsize=(10,6))
plt.scatter(insurance['income'], insurance['total_claim_amount'],alpha= 0.2)
plt.xlabel('Income')
plt.ylabel('Total Claim amount')
plt.show()
# Scatter of monthly premium against total claim amount.
plt.figure(figsize=(10,6))
plt.scatter(insurance['monthly_premium'], insurance['total_claim_amount'],alpha=0.2)
plt.xlabel('Monthly Premium')
plt.ylabel('Total Claim amount')
plt.show()
#Education
plt.figure(figsize=(10,6))
sns.boxplot(x='education', y= 'total_claim_amount', data= insurance,showfliers=False)
plt.xlabel('Education')
plt.ylabel('Total Claim amount')
plt.show()
#Open Complaints (original comment wrongly said "Education")
plt.figure(figsize=(10,6))
sns.boxplot(x='open_complaints', y= 'total_claim_amount', data= insurance,showfliers=False)
plt.xlabel('Open Complaints')
plt.ylabel('Total Claim amount')
plt.show()
#Coverage Type
plt.figure(figsize=(10,6))
sns.boxplot(x='coverage_type', y= 'total_claim_amount', data= insurance,showfliers=False)
plt.xlabel('Coverage Type',fontsize=16)
plt.ylabel('Total Claim amount')
plt.show()
#Gender - Total claims by male is slightly higher than claims by female
gender = pd.DataFrame(insurance.groupby(['gender'])['total_claim_amount'].sum())
# NOTE(review): a scalar ylim sets only the lower axis bound (axis starts
# at 1.8M), which magnifies the small M/F gap — confirm this is intended;
# the usual form is a (low, high) tuple.
gender.plot(kind='bar',ylim=1800000,figsize=(10,6))
plt.xlabel('Gender',fontsize=16)
plt.ylabel('Total Claim amount')
plt.show()
# Histogram and boxplot for every numeric column.
# np.number covers the int64/float64 columns; np.float was removed in
# NumPy 1.24, and sns.distplot was removed in seaborn 0.14 (histplot is
# its documented replacement for kde=False usage).
insurance_cont = insurance.select_dtypes(include=[np.number])
for col in insurance_cont.columns:
    sns.histplot(insurance[col])
    plt.show()
for i in insurance_cont.columns:
    insurance_cont.boxplot(column=i, showfliers=False)
    plt.show()
# 3x2 grid of pie charts showing the customer mix across key categoricals.
fig, ax = plt.subplots(3, 2, figsize=(50, 50))
pie_specs = [
    ('state', 'State', (0, 0)),
    ('sales_channel', 'Sales Channels', (0, 1)),
    ('employment_status', 'Employment Status', (1, 0)),
    ('policy_type', 'Policy Types', (1, 1)),
    ('marital_status', 'Marital Status', (2, 0)),
    ('education', 'Education', (2, 1)),
]
for col, title, (r, c) in pie_specs:
    counts = insurance.groupby([col])['customer'].count()
    # Labels come from the counts' own index. The original state pie used
    # labels=customerState.index.values — an unrelated frame whose index
    # only matched by coincidence.
    ax[r, c].pie(counts, labels=counts.index.values, autopct='%1.1f%%',
                 shadow=True, startangle=90, textprops={'fontsize': 40})
    ax[r, c].set_title(title, fontsize=60)
# Box and violin plots of CLV against every categorical predictor.
# `object` replaces np.object (removed in NumPy 1.24); catplot replaces
# sns.factorplot (removed from seaborn), whose `size` argument became
# `height`. showfliers only applies to box plots — violins have no fliers.
insurance_cat = insurance.select_dtypes(include=[object]).drop(columns='customer')
for col in insurance_cat.columns:
    sns.catplot(x=col, y='clv', kind='box', data=insurance, height=4, aspect=2, showfliers=False)
for col in insurance_cat.columns:
    sns.catplot(x=col, y='clv', kind='violin', data=insurance, height=4, aspect=2)
#cont_col = ['Customer Lifetime Value', 'Income', 'Months Since Policy Inception', 'Number of Open Complaints', 'Number of Policies', 'Total Claim Amount']
# One regression scatter (with fitted line) of CLV vs each numeric column.
for predictor in insurance_cont.columns:
    sns.lmplot(x=predictor, y="clv", data=insurance)
#Using dmatrices, automatically encodes dummies
#Below didn't work, I don't think dmatrices will take a df
#dy = insurance['Customer Lifetime Value']
#dX = insurance.drop(columns=['Customer', 'Effective To Date', 'Customer Lifetime Value'])
# Build the OLS design matrices; patsy one-hot encodes the categoricals
# and already adds an Intercept column.
y, X = dmatrices('clv ~ state + coverage_type + education + employment_status + gender + income + location_code + marital_status + monthly_premium + months_since_last_claim + months_since_policy_start + open_complaints + num_policies + sales_channel + vehicle_class + vehicle_size',
data=insurance, return_type='dataframe')
#If using pd dummies above
#X = reg_df.drop(columns=['clv'])
#y = reg_df['clv']
## fit a OLS model
# add_constant is effectively a no-op here: its default has_constant='skip'
# leaves the Intercept patsy already added untouched.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
model.summary()
#Low Rsquared at 0.16, only 16% of variance explained by model
#very low p value for omnibus and JB test suggests non-normal distribution
#diagnostic plots in python: https://robert-alvarez.github.io/2018-06-04-diagnostic_plots/
# Pull the standard OLS diagnostic quantities off the fitted model.
model_fitted_y = model.fittedvalues
# model residuals
model_residuals = model.resid
# normalized (internally studentized) residuals
model_norm_residuals = model.get_influence().resid_studentized_internal
# square root of absolute normalized residuals (for scale-location plots)
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
# absolute residuals
model_abs_resid = np.abs(model_residuals)
# leverage, from statsmodels internals
model_leverage = model.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
model_cooks = model.get_influence().cooks_distance[0]
# Residuals-vs-fitted plot with a lowess smoother.
plot_lm_1 = plt.figure()
# x/y passed by keyword: seaborn >= 0.12 made them keyword-only. Keeping
# the returned Axes in a name (the original assigned into figure.axes[0],
# which mutated a temporary list and did nothing).
ax0 = sns.residplot(x=model_fitted_y, y=insurance.clv,
                    lowess=True,
                    scatter_kws={'alpha': 0.7},
                    line_kws={'color': 'red', 'lw': 2, 'alpha': 0.6})
ax0.set_title('Residuals vs Fitted')
ax0.set_xlabel('Fitted values')
ax0.set_ylabel('Residuals')
#not equal variance, appears heteroskedastic
# Breusch-Pagan test: small p-values reject homoskedastic errors.
name = ['Lagrange multiplier statistic', 'p-value',
'f-value', 'f p-value']
test = sms.het_breuschpagan(model_residuals, model.model.exog)
lzip(name, test)
#highly significant results further suggest heteroskedasticity
# Scatter of model predictions against the actual CLV values.
plt.scatter(model_fitted_y, insurance.clv)
plt.xlabel("Predicted CLV by Model")
plt.ylabel("Actual CLV")
plt.title("Predictions vs. Original CLV")
# Histogram of the raw residuals (original comment here was the unfinished "tends to").
model_residuals.hist()
#skewed right tail
# Probability (QQ) plot of the residuals; r is the line-fit correlation.
pred_val = model.fittedvalues.copy()
true_val = insurance['clv'].values.copy()
residual = true_val - pred_val
fig, ax = plt.subplots(figsize=(6,2.5))
_, (__, ___, r) = sp.stats.probplot(residual, plot=ax, fit=True)
#skewed right tailed
# Leverage vs squared residuals, then an influence plot (Cook's distance).
fig, ax = plt.subplots(figsize=(15,3))
fig = sm.graphics.plot_leverage_resid2(model, ax=ax)
fig, ax = plt.subplots(figsize=(15,6))
fig = sm.graphics.influence_plot(model, ax=ax,criterion='cooks')
# For each X, calculate VIF and save in dataframe (>5 is considered colinear)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
vif.round(1)
#Monthly premium is highly collinear with customer lifetime value. This makes sense: the more the customer pays, the higher the value of the customer. It seems this is not cancelled out by higher claims.
#Intercept is highly collinear — NOTE(review): a large intercept VIF is expected with dummy-encoded predictors and is typically not a concern.
#model not improved without monthly premium
# Refit without monthly_premium and recompute the VIFs for comparison.
X = X.drop(columns=['monthly_premium'])
model = sm.OLS(y, X).fit()
model.summary()
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
vif.round(1)
# Rebuild the full design matrices, then fit on a log-transformed target.
y, X = dmatrices('clv ~ state + coverage_type + education + employment_status + gender + income + location_code + marital_status + monthly_premium + months_since_last_claim + months_since_policy_start + open_complaints + num_policies + sales_channel + vehicle_class + vehicle_size',
data=insurance, return_type='dataframe')
# log(clv + 1) compresses the heavily right-skewed target.
y['clv'] = np.log(y['clv']+1)
#Couldn't make formula work, probably incorrect anyways: X[['income', 'monthly_premium', 'months_since_last_claim', 'months_since_policy_start','num_policies']].apply(lambda x:np.log(x) if column.value > 0 else np.log(x),axis=1)
model_trans = sm.OLS(y, X).fit()
model_trans.summary()
#log improved R2 to 0.25 but still non-normal
# Diagnostics for the log-transformed model.
model_trans_fitted_y = model_trans.fittedvalues
# model residuals
model_trans_residuals = model_trans.resid
# normalized residuals
model_trans_norm_residuals = model_trans.get_influence().resid_studentized_internal
# absolute squared normalized residuals
model_trans_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_trans_norm_residuals))
# absolute residuals
model_trans_abs_resid = np.abs(model_trans_residuals)
# leverage, from statsmodels internals
model_trans_leverage = model_trans.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
model_trans_cooks = model_trans.get_influence().cooks_distance[0]
plot_lm_1 = plt.figure()
# Compare like with like: this model was fitted on log1p(clv), so the
# y values must be on the same scale (the original passed raw clv against
# log-scale fitted values). x/y by keyword for seaborn >= 0.12.
ax0 = sns.residplot(x=model_trans_fitted_y, y=np.log1p(insurance.clv),
                    lowess=True,
                    scatter_kws={'alpha': 0.7},
                    line_kws={'color': 'red', 'lw': 2, 'alpha': 0.6})
ax0.set_title('Residuals vs Fitted for transformed model')
ax0.set_xlabel('Fitted values')
ax0.set_ylabel('Residuals')
model_trans_residuals.hist()
#skewed right tail
# QQ plot of the residuals, also on the log scale (the original mixed
# raw clv with log-scale fitted values, so its "residuals" mixed units).
pred_val = model_trans.fittedvalues.copy()
true_val = np.log1p(insurance['clv'].values)
residual = true_val - pred_val
fig, ax = plt.subplots(figsize=(6,2.5))
_, (__, ___, r) = sp.stats.probplot(residual, plot=ax, fit=True)
#skewed right tailed
#model by forward selection
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.05,
                       threshold_out=0.05,
                       verbose=True):
    """Perform a forward-backward (stepwise) feature selection
    based on p-values from statsmodels.api.OLS.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: (list of selected features, summary of the final fitted model)

    Always set threshold_in <= threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # None default instead of a mutable [] default (shared between calls).
    included = list(initial_list) if initial_list is not None else []
    model = None
    while True:
        changed = False
        # forward step: fit with each excluded feature added, one at a time
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        # guard the empty case: min() on an empty Series is NaN
        if not new_pval.empty and new_pval.min() < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, new_pval.min()))
        # backward step: refit on the included set and drop the weakest
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            # idxmax (label), not argmax: Series.argmax returns a positional
            # index on modern pandas, and included.remove needs the name
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included, model.summary()
# Run stepwise selection on the (log-target) design matrices and report.
result = stepwise_selection(X, y)
print('resulting features:')
print(result)
#no predictive improvement with reduced model...
# Second regression: model total_claim_amount instead of clv.
# The original passed data=df_work, which is never defined anywhere in
# this script (NameError); `insurance` is the frame with these columns.
z, D = dmatrices('total_claim_amount ~ state + coverage_type + education + employment_status + gender + income +location_code + marital_status + months_since_last_claim + months_since_policy_start + open_complaints + num_policies + sales_channel + vehicle_class + vehicle_size',
                 data=insurance,
                 return_type='dataframe')
# Fit model - considering all the variables
MLR = sm.OLS(z, D).fit()
# Summarize model
MLR.summary()
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
# Reload the raw data for the classification task and rename the columns
# (same snake_case scheme as the regression section).
data = pd.read_csv('https://raw.githubusercontent.com/jgoodz8/3251_Project_Group2/master/AutoInsurance.csv')
data.columns = ['customer', 'state', 'clv', 'response', 'coverage_type', 'education', 'effective_to_date',\
'employment_status', 'gender', 'income' , 'location_code', 'marital_status', 'monthly_premium',\
'months_since_last_claim', 'months_since_policy_start', 'open_complaints', 'num_policies',\
'policy_type', 'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',\
'vehicle_class', 'vehicle_size']
## Allows us to display all the columns in one go
pd.options.display.max_columns = None
data.head(5)
Using logistic regression, predict the response received through the marketing campaign,
using the following variables:
A basic histogram which showcase the 'distribution' of what the numeric variables look like
# Distributions of the two numeric predictors used in the logistic model.
num_vars= ['monthly_premium','income']
data[num_vars].hist(figsize=(10,6))
plt.show()
A basic visualization of the distribution and counts of different options in the categorical fields.
# Categorical predictors for the logistic model, and a count plot of each.
cat_vars = ['employment_status', 'gender', 'sales_channel', 'renew_offer_type']
# x= by keyword: seaborn >= 0.12 no longer accepts the column positionally
# (the first positional argument of catplot is now `data`).
sns.catplot(x='renew_offer_type', kind="count", data=data)
sns.catplot(x='sales_channel', kind="count", data=data)
sns.catplot(x='gender', kind="count", data=data)
sns.catplot(x='employment_status', kind="count", data=data)
plt.show()
A basic visualization of the distribution and counts of the Response variable
# Class balance of the target; x= by keyword for seaborn >= 0.12.
sns.catplot(x='response', kind="count", data=data)
plt.show()
We can see that we have a class imbalance.
~80% of the response variable is a 'No'.
By 'cheating' we can claim to achieve ~80% accuracy of our model by simply setting all of the responses to 'No'.
In order to avoid this bias, We'll use SMOTE (Synthetic minority oversampling technique) to oversample the "Yes" response in the training data.
# One-hot encode the categoricals and join with the numeric predictors;
# the target is 1 for a "Yes" response, else 0.
data_cat= pd.get_dummies(data[cat_vars])
data_final = data[num_vars].join(data_cat)
data_final['binary_response'] = np.where(data['response']=="Yes",1,0)
data_final.head(3)
X= Predictors
y= Response
# Separate predictors and target, then a 70/30 train-test split (fixed seed).
X = data_final.loc[:, data_final.columns != 'binary_response']
y = data_final.loc[:, data_final.columns == 'binary_response']
# flatten the single-column target frame to a 1-D array
y = np.ravel(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
As discussed in 1.3, use SMOTE (Synthetic minority oversampling technique) to oversample the "Yes" response in the training data
# Oversample the minority "Yes" class — on the training split only, so the
# test split keeps the true class balance.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)  # renamed from `os`, which shadowed the os module
# fit_resample replaces fit_sample, removed in imbalanced-learn 0.6
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
columns = X_train.columns
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['binary_response'])
## Check the data
print("OS data length ",len(os_data_X))
print("OS 'No' response ", len(os_data_y[os_data_y['binary_response']==0]))
print("OS 'Yes' response ", len(os_data_y[os_data_y['binary_response']==1]))
from statsmodels.discrete.discrete_model import Logit
from scipy.special import logit
# Fit a statsmodels Logit on the oversampled training data.
# NOTE(review): no constant column is added, so the model is fit without
# an explicit intercept — confirm this is intended (get_dummies keeps all
# category levels, which partially plays the intercept's role).
os_data_y = np.ravel(os_data_y)
X_train = os_data_X
y_train = os_data_y
model = Logit(y_train,X_train)
model=model.fit(maxiter=100)
print(model.summary2())
Conclusion:
Based on the model summary,
The following can be concluded:
Derive the binary estimates $\hat{y}$ of the response $y$.
A decision boundary of 0.5 has been used, i.e. a threshold value at or above which an observation is classified as 1, and below which it is classified as 0.
To summarize:
p >= 0.5, $\hat{y}$ = 1
p < 0.5, $\hat{y}$ = 0
def predict(df, thresh=0.5):
    """Classify each row of `df`: 1 where the fitted `model`'s predicted
    probability is at least `thresh`, 0 otherwise."""
    probabilities = model.predict(df)
    labels = (probabilities >= thresh) * 1
    return labels
y_hat = predict(X_test)
# confusion_matrix expects (y_true, y_pred); the original passed them
# swapped, which transposed the matrix relative to the "Actual"/
# "Predicted" axis labels drawn below. Rows are now the actual labels.
cnf_matrix = metrics.confusion_matrix(y_test, y_hat)
class_names = [0, 1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="Blues", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix', y=10, fontsize=18)
plt.ylabel('Actual label', fontsize=18)
plt.xlabel('Predicted label', fontsize=18)
Precision: The ability of the classifier NOT to label as positive a sample that is negative.
Recall: The ability of the classifier to find all the positive samples.
From the report below we can see that recall is 70%. This means that out of all the customers that responded "Yes", we managed to label 70% of them correctly using the predictor variables we selected.
Precision average is also high at 82%
# Per-class precision/recall/F1 on the held-out test split.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_hat))
We can see that the Area Under the Curve (AUC) is 0.68, meaning that given a random "Yes" customer and a random "No" customer, the model ranks the "Yes" customer higher about 68% of the time.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# ROC curve of the model's probabilities vs the 45-degree no-skill line.
logit_roc_auc = roc_auc_score(y_test, model.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model.predict(X_test))
plt.figure(figsize=(10,6))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
We examine the relationship between threshold and precision of the model
## Fraction of all samples that are true positives: cell [1,1] over the grand total
def true_pos(cm):
    """Return cm.iloc[1, 1] divided by the sum of every cell of `cm`
    (a 2x2 crosstab DataFrame)."""
    grand_total = np.sum(np.sum(cm))
    return cm.iloc[1, 1] / grand_total
def accuracy(predicted, actual):
    """Fraction of entries where `predicted` equals `actual`
    (element-wise, over array-likes of 0/1 labels)."""
    diff = predicted - actual
    n = len(diff)
    n_wrong = np.count_nonzero(diff)
    return (n - n_wrong) / n
# Sweep the decision threshold and record the true-positive fraction.
decision_boundaries = np.arange(0, 1, 0.01)
accuracies = []
db = []
for p in decision_boundaries:
    try:
        y_hat = predict(X_test, thresh=p)
        cm = pd.crosstab(y_hat, y_test)
        accuracies.append(true_pos(cm))
        db.append(p)
    except IndexError:
        # At extreme thresholds every prediction is a single class, so the
        # crosstab has only one row and cm.iloc[1, 1] fails — skip that
        # point. (The original bare `except: pass` also hid real errors.)
        pass
# Overall accuracy at every threshold, then plot both accuracy curves.
accuracies_2 = []
for p in decision_boundaries:
    y_hat = predict(X_test, thresh=p)
    accuracies_2.append(accuracy(y_hat, y_test))
plt.figure(figsize=(10, 6))
plt.plot(decision_boundaries, accuracies_2)
plt.plot(db, accuracies)
plt.xlabel('Boundary Threshold', fontsize=12)
plt.ylabel('Model Accuracy in %', fontsize=12)
#plt.axvline(0.23, color='red')
plt.legend(('Overall Model accuracy','True Positive accuracy','Optimum Point'),fontsize=12)
plt.show()
Conclusion:
This graph visualizes an interesting tradeoff.
As we increase the boundary threshold, we increase the overall model accuracy at the expense of True positive accuracy.
Although no ideal optimum points exist (we used a 0.5 threshold previously), we can use this as a guideline to fine tune our desired outcome.